#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os, sys
import re

# Every smart-punctuation replacement is a numeric entity of this length,
# e.g. "&#8217;".
_ENTITY_LEN = 7

# Number of source characters collapsed into each dash/ellipsis entity.
_RUN_LENGTHS = {'&#8212;': 3, '&#8211;': 2, '&#8230;': 3}

# Characters that can start a span which smartening replaces.
_REPLACEABLE = re.compile(r'''['".-]''')

# An opening <pre>/<code> tag, with or without attributes.
_PROTECTED_OPEN = re.compile(r'<(?:pre|code)(?=[\s>]|$)')

# Dashes and ellipsis: -- = endash, --- = emdash, ... = ellipsis.
# Negative lookarounds (rather than "a non-dash character on each side")
# let a run at the very start or end of the text be converted too.
_PUNCTUATION_RULES = tuple(
    (re.compile(pattern), entity)
    for pattern, entity in (
        (r'(?<!-)---(?!-)', '&#8212;'),  # mdash
        (r'(?<!-)--(?!-)', '&#8211;'),   # ndash
        (r'\.\.\.', '&#8230;'),          # hellip
    )
)

# Quote rules, applied strictly in this order: later rules only see the
# quotes that earlier rules left untouched.
_QUOTE_RULES = tuple(
    (re.compile(pattern, flags), entity)
    for pattern, entity, flags in (
        # beginning or end of line
        (r'^"', '&#8220;', re.M),  # ld
        (r"^'", '&#8216;', re.M),  # ls
        (r'"$', '&#8221;', re.M),  # rd
        (r"'$", '&#8217;', re.M),  # rs
        # digits followed by quote becomes prime quote (seconds/minutes)
        (r'(?<=\d)"(?=\s|\d|[;,.!?])', '&#8243;', 0),
        (r"(?<=\d)'(?=\s|\d|[;,.!?])", '&#8242;', 0),
        # decade abbreviations (the '80s)
        (r"'(?=\d\d[s;,.!?]?\s)", '&#8217;', 0),  # rs
        # quote following an open brace/bracket/parenthesis is an opening one
        (r"(?<=[\[({])'", '&#8216;', 0),  # ls
        (r'(?<=[\[({])"', '&#8220;', 0),  # ld
        # any dash-space-quote-space -> closing quote; other isolated
        # quotes are turned into opening ones by the catch-all rules below
        (r'(?<=&#821[12];\s)"(?=\s)', '&#8221;', 0),  # rd
        (r"(?<=&#821[12];\s)'(?=\s)", '&#8217;', 0),  # rs
        (r'(?<=-\s)"(?=\s)', '&#8221;', 0),           # rd
        (r"(?<=-\s)'(?=\s)", '&#8217;', 0),           # rs
        # double sets of quotes
        (r'''"'(?=\w)''', '&#8220;&#8216;', 0),  # ld ls
        (r"""'"(?=\w)""", '&#8216;&#8220;', 0),  # ls ld
        (r'""(?=\w)', '&#8220;&#8220;', 0),      # ld ld
        (r"''(?=\w)", '&#8216;&#8216;', 0),      # ls ls
        ('"\'', '&#8221;&#8217;', 0),            # rd rs
        ('\'"', '&#8217;&#8221;', 0),            # rs rd
        ('""', '&#8221;&#8221;', 0),             # rd rd
        ("''", '&#8217;&#8217;', 0),             # rs rs
        # quotes adjacent to word characters
        (r'(?<=\W)"(?=\w)', '&#8220;', 0),  # ld
        (r"(?<=\W)'(?=\w)", '&#8216;', 0),  # ls
        (r'(?<=\w)"(?=\W)', '&#8221;', 0),  # rd
        (r"(?<=\w)'(?=\W)", '&#8217;', 0),  # rs
        (r"(?<=\w)'(?=\w)", '&#8217;', 0),  # rs
        # any remaining single quotes
        (r"(?<=\S)'", '&#8217;', 0),
        (r"'", '&#8216;', 0),
        # any remaining double quotes
        (r'(?<=\S)"', '&#8221;', 0),
        (r'"', '&#8220;', 0),
    )
)


def _classify(entities):
    """Pair each non-empty fragment with True when it is smartenable text.

    Tags, and any text found while inside a pre/code block, are flagged
    False and must be copied to the output unchanged.
    """
    depth = 0  # how many pre/code blocks are currently open
    classified = []
    for entity in entities:
        if not entity:
            continue
        if _PROTECTED_OPEN.match(entity):
            depth += 1
        if entity in ('</pre>', '</code>'):
            depth -= 1
        classified.append((entity, entity[0] != '<' and depth == 0))
    return classified


def _load_apos_exceptions():
    """Read the exception words from apos_exceptions.txt beside this file."""
    here = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(here, 'apos_exceptions.txt'), 'r') as fd:
        return fd.read().split()


def _smarten_plain(text, apos_exceptions):
    """Apply every substitution to tag-free text and return the result."""
    for pattern, entity in _PUNCTUATION_RULES:
        text = pattern.sub(entity, text)

    # Words that begin with an apostrophe ('tis, 'twas, ...): without this
    # the leading apostrophe would become an opening single quote.
    # Entries are interpolated into the pattern as written.
    for entry in apos_exceptions:
        text = re.sub(r"'(%s\b)" % entry.strip("'"), r"&#8217;\1", text)

    for pattern, entity in _QUOTE_RULES:
        text = pattern.sub(entity, text)
    return text


def _rebuild(classified, plain, smart):
    """Re-insert smartened text between the untouched tags.

    *plain* is the joined smartenable text and *smart* the same text after
    substitution.  The two are walked in lockstep so each fragment gets
    exactly the slice of *smart* that its own characters produced; a
    replaced run that straddles a tag is given to the fragment in which
    the run starts.
    """
    output = []
    i = 0        # offset into plain
    j = 0        # matching offset into smart
    seg_end = 0  # offset in plain where the current fragment ends
    for entity, smartenable in classified:
        if not smartenable:
            output.append(entity)
            continue
        seg_end += len(entity)
        start = j
        while i < seg_end:
            hit = _REPLACEABLE.search(plain, i, seg_end)
            if hit is None:
                # Nothing replaceable is left: the rest was copied verbatim.
                j += seg_end - i
                i = seg_end
                break
            j += hit.start() - i
            i = hit.start()
            if plain[i] in '\'"':
                # Every quote ends up as exactly one entity.
                i += 1
                j += _ENTITY_LEN
            elif smart[j:j + 1] == plain[i]:
                # A dash or dot that was left alone.
                i += 1
                j += 1
            else:
                # A dash/dot run collapsed into a single entity.
                i += _RUN_LENGTHS[smart[j:j + _ENTITY_LEN]]
                j += _ENTITY_LEN
        output.append(smart[start:j])
    return ''.join(output)


def smarten(text, apos_exceptions=None):
    """Return *text* (HTML) with typographic punctuation as numeric entities.

    Straight quotes become curly quotes or primes, ``--`` an en dash,
    ``---`` an em dash and ``...`` an ellipsis.  Tags, comments and the
    contents of ``<pre>``/``<code>`` blocks are left untouched.  The text
    between tags is joined before substitution, so a quote next to a tag
    is judged by the text on the other side of that tag.

    Args:
        text: the HTML source to convert.
        apos_exceptions: optional iterable of words that start with an
            apostrophe, given without the apostrophe.  When None, the
            words are read from ``apos_exceptions.txt`` next to this file.

    Returns:
        The converted HTML.

    Raises:
        IOError/OSError: if *apos_exceptions* is None and the exceptions
            file cannot be read.
    """
    if apos_exceptions is None:
        apos_exceptions = _load_apos_exceptions()

    # Surround comments with <pre></pre> tags so we ignore them
    text = re.sub(r'(<!--.+?-->)', r'<pre>\1</pre>', text, flags=re.M | re.S)

    # Split the html into tags and text
    entities = re.split(r'(<.+?>)', text, flags=re.M | re.S)
    classified = _classify(entities)

    # Smarten just the text that lies outside tags and pre/code blocks
    plain = ''.join(entity for entity, smartenable in classified if smartenable)
    smart = _smarten_plain(plain, apos_exceptions)

    # Now rebuild the html
    text = _rebuild(classified, plain, smart)

    # remove <pre></pre> tags from comment blocks
    text = re.sub(r'<pre>(<!--.+?-->)</pre>', r'\1', text, flags=re.M | re.S)

    return text

def main(argv=sys.argv):
    """Smarten every HTML file named on the command line, in place.

    Args:
        argv: argument vector; argv[0] is the program name and each
            following item is the path of a file to convert.

    Returns:
        1 after printing usage when no file was given, otherwise 0.
    """
    if len(argv) < 2:
        # Parenthesised single-argument form prints the same text under
        # both Python 2 and Python 3.
        print("Usage:")
        print("  smarten.py infile")
        return 1

    # 'U' (universal newlines) is only needed on Python 2; Python 3 text
    # mode already translates newlines and rejects the flag from 3.11 on.
    read_mode = 'rU' if sys.version_info[0] < 3 else 'r'

    for infile in argv[1:]:
        with open(infile, read_mode) as fd:
            html = fd.read()

        html = smarten(html)

        # Overwrite the input file with the converted text.
        with open(infile, 'w') as fd:
            fd.write(html)

    return 0
    
# Script entry point: run main() and hand its return value to the
# interpreter as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
